{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Instalação:\n", "\n", "Para criar o ambiente com as ferramentas necessárias:\n", "\n", "`conda create --name textrepr python=3 scikit-learn numpy scipy gensim nltk pandas jupyter ipython`\n", "\n", "Para ativar o ambiente:\n", "\n", "`source activate lconmeetings`\n", "\n", "`pip install powerlaw`\n", "\n", "Para executar o ambiente de programação, no mesmo diretório do arquivo Text Representations.ipynb digite:\n", "\n", "`jupyter notebook`\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import sklearn.feature_extraction.text as txtfeats\n", "import powerlaw\n", "from nltk import skipgrams" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0datetimescreen_nametext
002016-04-1500:51:27TierroDavidrt revistaepoca dilma diz que se resistir ao i...
112016-04-1500:52:20TierroDavidrt folha grupos pro e contra impeachment convo...
222016-04-1500:54:06TierroDavidestadao estadao irresponsavel patrocinando o g...
332016-04-1500:11:32lisadofloplembrei de alguem com essa foto httpstcoxjwnbo...
442016-04-1501:27:48TierroDavidrt folha ministros sao exonerados para votarem...
\n", "
" ], "text/plain": [ " Unnamed: 0 date time screen_name \\\n", "0 0 2016-04-15 00:51:27 TierroDavid \n", "1 1 2016-04-15 00:52:20 TierroDavid \n", "2 2 2016-04-15 00:54:06 TierroDavid \n", "3 3 2016-04-15 00:11:32 lisadoflop \n", "4 4 2016-04-15 01:27:48 TierroDavid \n", "\n", " text \n", "0 rt revistaepoca dilma diz que se resistir ao i... \n", "1 rt folha grupos pro e contra impeachment convo... \n", "2 estadao estadao irresponsavel patrocinando o g... \n", "3 lembrei de alguem com essa foto httpstcoxjwnbo... \n", "4 rt folha ministros sao exonerados para votarem... " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tweets = pd.read_csv('tweets.csv.gz')\n", "tweets.head()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "bagofwords = txtfeats.CountVectorizer()\n", "bow = bagofwords.fit_transform(tweets.text)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "231602\n" ] }, { "data": { "text/plain": [ "['alemanharoubo',\n", " 'alemanhas',\n", " 'alemao',\n", " 'alemaoe',\n", " 'alemaofernando',\n", " 'alemaomarau',\n", " 'alempara',\n", " 'alencar',\n", " 'alencarinos',\n", " 'alencarmariza']" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(len(bagofwords.get_feature_names()))\n", "bagofwords.get_feature_names()[5000:5010]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "16.223622447244896\n" ] } ], "source": [ "avg = sum(len(t.split()) for t in tweets.text)/len(tweets.text)\n", "print(avg)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "O id de \"alemanharoubo\" é 5000\n", 
"Total de 231602 atributos.\n" ] } ], "source": [ "print('O id de \"alemanharoubo\" é', bagofwords.vocabulary_.get('alemanharoubo'))\n", "print('Total de {} atributos.'.format(len(bagofwords.vocabulary_)))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " (0, 5002)\t1\n", " (0, 201288)\t1\n", " (0, 220665)\t1\n", " (0, 226539)\t1\n", " (0, 226860)\t2\n", "Índice das palavras presentes: [ 5002 201288 220665 226539 226860]\n", "\n", "O termo \"um\" tem índice 226860\n", "O termo \"novo\" tem índice 201288\n", "O termo \"tweet\" tem índice 226539\n", "O termo \"sobre\" tem índice 220665\n", "O termo \"um\" tem índice 226860\n", "O termo \"alemao\" tem índice 5002\n", "O termo \"kywz\" tem índice None\n", "\n", "A frequência do termo \"um\" no documento é: 2\n", "Transformação inversa de x: ['alemao' 'novo' 'sobre' 'tweet' 'um']\n" ] } ], "source": [ "novo_tweet = 'um novo tweet sobre um alemao kywz'\n", "\n", "print(bagofwords.transform([novo_tweet]))\n", "\n", "x = (bagofwords.transform([novo_tweet])\n", " .toarray()\n", " )\n", "\n", "print('Índice das palavras presentes: ', x.nonzero()[1])\n", "print()\n", "\n", "for t in novo_tweet.split():\n", " print('O termo \"{}\" tem índice {}'.format(t, bagofwords.vocabulary_.get(t)))\n", "\n", "print()\n", "print('A frequência do termo \"um\" no documento é: ', x[0,226860])\n", "\n", "print('Transformação inversa de x: ', bagofwords.inverse_transform(x)[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Outros parâmetros\n", "\n", "`strip_accents : {‘ascii’, ‘unicode’, None}`: remove acentos das palavras com codificação ascii ou unicode (None por padrão).\n", "\n", "`stop_words : string {‘english’}, list, or None (default)`: permite usar uma lista de *stop words* (None por padrão).\n", "\n", "`lowercase : boolean, True by default`: transforma as letras em caixa baixa (padrão).\n", "\n", "`binary : 
boolean, default=False`: não contabiliza a frequência dos termos (Falso por padrão)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Luhn cut-off\n", "\n", "`max_df : float in range [0.0, 1.0] or int, default=1.0`: remove os termos com frequência de documento maior que N.\n", "\n", "`min_df : float in range [0.0, 1.0] or int, default=1`: remove os termos com frequência de documento menor que N.\n", "\n", "`max_features : int or None, default=None`: seleciona os N atributos mais frequentes." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Calculating best minimal value for power law fit\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "PL? (or EXP): True\n", "alpha = 2.00175723834\n", "df_min = 106 df_max = 11172\n" ] } ], "source": [ "df = sorted(bow.sum(axis=0).A1)\n", "fit = powerlaw.Fit(df)\n", "cmp = fit.distribution_compare('power_law', 'exponential')\n", "df_max = int(np.round(np.power(fit.xmin*np.power(2, 1/(fit.power_law.alpha-1)),4/3)))\n", "df_min = int(np.round(np.power(fit.xmin*np.power(2, 1/(fit.power_law.alpha-1)),2/3)))\n", "\n", "print('PL? 
(or EXP): ', cmp[1]<0.05)\n", "print('alpha = ', fit.power_law.alpha)\n", "print('df_min = {} df_max = {}'.format(df_min, df_max))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total de 5333 atributos.\n" ] } ], "source": [ "bagofwords = txtfeats.CountVectorizer(min_df=df_min, max_df=df_max, \n", " strip_accents='unicode', stop_words=['e', 'ou', 'ele', 'ela'])\n", "bow = bagofwords.fit_transform(tweets.text)\n", "print('Total de {} atributos.'.format(len(bagofwords.vocabulary_)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## N-grams\n", "\n", "`analyzer : string, {‘word’, ‘char’, ‘char_wb’}`: se os n-grams serão determinados por palavras, por caracteres ou por caracteres dentro das palavras\n", "\n", "`ngram_range : tuple (min_n, max_n)`: gera n-gramas com min_n <= n <= max_n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total de 10048 atributos.\n" ] } ], "source": [ "bagofwords = txtfeats.CountVectorizer(min_df=df_min, max_df=df_max, \n", " strip_accents='unicode', ngram_range=(2,2))\n", "bow = bagofwords.fit_transform(tweets.text)\n", "print('Total de {} atributos.'.format(len(bagofwords.vocabulary_)))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tfidf = txtfeats.TfidfVectorizer()\n", "bow = tfidf.fit_transform(tweets.text)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "norm : ‘l1’, ‘l2’ or None, optional\n", "\n", "Norm used to normalize term vectors. 
None for no normalization.\n", "\n", "use_idf : boolean, default=True Enable inverse-document-frequency reweighting.\n", "\n", "smooth_idf : boolean, default=True\n", "Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions.\n", "\n", "sublinear_tf : boolean, default=False Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Skipgram\n", "\n", "`tfidf = txtfeats.TfidfVectorizer(tokenizer=skip3n2grams)`" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total de 38330 atributos.\n" ] } ], "source": [ "skip3n2grams = lambda s: skipgrams(s.split(), 2, 3)\n", "tfidf = txtfeats.TfidfVectorizer(tokenizer=skip3n2grams, \n", "                                 min_df=df_min, max_df=df_max)\n", "bow = tfidf.fit_transform(tweets.text)\n", "print('Total de {} atributos.'.format(len(tfidf.vocabulary_)))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[('acovardado', 'mas'),\n", " ('acovardado', 'na'),\n", " ('acovardado', 'o'),\n", " ('acovardado', 'obriga'),\n", " ('acovardado', 'permitira'),\n", " ('acovardado', 'poder'),\n", " ('acovardado', 'povo'),\n", " ('acovardado', 'so'),\n", " ('acovardado', 'stf'),\n", " ('acovardado', 'tom')]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf.get_feature_names()[10000:10010]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# MinHash" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": false }, "outputs": [], "source": [ "P = 109297\n", "nhashes = 3\n", "alphas = np.random.randint(1, P-1, (nhashes,))\n", "betas = np.random.randint(1, P-1, (nhashes,))\n", "\n", "H = [list(map(hash,t.split())) for t in 
tweets.text[:100]]\n", "MH = np.zeros((100,nhashes))\n", "\n", "for i, h in enumerate(H):\n", " for j in range(nhashes):\n", " MH[i,j] = np.min(np.remainder(alphas[j]*h + betas[j], P))" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 1.38460000e+04, 4.10000000e+01, 2.00000000e+01],\n", " [ 1.53650000e+04, 8.05300000e+03, 9.03000000e+02],\n", " [ 1.97050000e+04, 2.24230000e+04, 5.95900000e+03],\n", " [ 1.44010000e+04, 4.05370000e+04, 4.59300000e+03],\n", " [ 1.45870000e+04, 8.05300000e+03, 2.93200000e+03],\n", " [ 1.36300000e+03, 6.01600000e+03, 5.33700000e+03],\n", " [ 1.80000000e+03, 8.70800000e+03, 4.59300000e+03],\n", " [ 1.92120000e+04, 3.05900000e+03, 4.59300000e+03],\n", " [ 1.60000000e+02, 1.29300000e+04, 4.59300000e+03],\n", " [ 3.04200000e+03, 8.70800000e+03, 2.46940000e+04]])" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "MH[:10,:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Doc2Vec" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from gensim.models import doc2vec\n", "\n", "class LabeledLineSentence(object):\n", " def __init__(self, stream):\n", " self.stream = stream\n", " def __iter__(self):\n", " for uid, line in enumerate(self.stream):\n", " yield doc2vec.LabeledSentence(words=line.split(), tags=['SENT_%s' % uid])" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": false }, "outputs": [], "source": [ "model = doc2vec.Doc2Vec(LabeledLineSentence(tweets.text.values[:1000]))" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X = model.docvecs" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([-0.00484932, -0.00559254, 0.00104118, 0.00191504, -0.00015034,\n", 
" 0.00358489, -0.00116596, 0.00536888, 0.00067337, 0.00054678], dtype=float32)" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X[0][:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda env:lconmeetings]", "language": "python", "name": "conda-env-lconmeetings-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }